In [1]:
import pandas as pd
train = pd.read_csv('/Users/ruohezhou/Documents/451project/train.csv')
valid = pd.read_csv('/Users/ruohezhou/Documents/451project/valid.csv')
print(len(train))
print(len(valid))
frames = [train, valid]
df = pd.concat(frames) #merge train and valid dataset into one dataset
print(len(df))
45000
15000
60000
In [2]:
train.head()
Out[2]:
Id Title Body Tags CreationDate Y
0 34552656 Java: Repeat Task Every Random Seconds <p>I'm already familiar with repeating tasks e... <java><repeat> 2016-01-01 00:21:59 LQ_CLOSE
1 34553034 Why are Java Optionals immutable? <p>I'd like to understand why Java 8 Optionals... <java><optional> 2016-01-01 02:03:20 HQ
2 34553174 Text Overlay Image with Darkened Opacity React... <p>I am attempting to overlay a title over an ... <javascript><image><overlay><react-native><opa... 2016-01-01 02:48:24 HQ
3 34553318 Why ternary operator in swift is so picky? <p>The question is very simple, but I just cou... <swift><operators><whitespace><ternary-operato... 2016-01-01 03:30:17 HQ
4 34553755 hide/show fab with scale animation <p>I'm using custom floatingactionmenu. I need... <android><material-design><floating-action-but... 2016-01-01 05:21:48 HQ
In [3]:
df.iloc[0, 2]
Out[3]:
'<p>I\'m already familiar with repeating tasks every n seconds by using Java.util.Timer and Java.util.TimerTask. But lets say I want to print "Hello World" to the console every random seconds from 1-5. Unfortunately I\'m in a bit of a rush and don\'t have any code to show so far. Any help would be apriciated.  </p>\n'
In [4]:
#only run one time, otherwise run from the first chunk where df just got read and merged
df['Tags'] = df['Tags'].apply(lambda x: x[1:-1].split('><'))
df['Body'] = df['Body'].apply(lambda x: x[3:-6])
In [5]:
df.head()
Out[5]:
Id Title Body Tags CreationDate Y
0 34552656 Java: Repeat Task Every Random Seconds I'm already familiar with repeating tasks ever... [java, repeat] 2016-01-01 00:21:59 LQ_CLOSE
1 34553034 Why are Java Optionals immutable? I'd like to understand why Java 8 Optionals we... [java, optional] 2016-01-01 02:03:20 HQ
2 34553174 Text Overlay Image with Darkened Opacity React... I am attempting to overlay a title over an ima... [javascript, image, overlay, react-native, opa... 2016-01-01 02:48:24 HQ
3 34553318 Why ternary operator in swift is so picky? The question is very simple, but I just could ... [swift, operators, whitespace, ternary-operato... 2016-01-01 03:30:17 HQ
4 34553755 hide/show fab with scale animation I'm using custom floatingactionmenu. I need to... [android, material-design, floating-action-but... 2016-01-01 05:21:48 HQ
In [6]:
df['Y'].unique()
Out[6]:
array(['LQ_CLOSE', 'HQ', 'LQ_EDIT'], dtype=object)

WordCloud

给高质量和低质量的各画一幅wordcloud

In [6]:
df_high = df[df['Y'] == 'HQ']
df_high = df_high.drop(['Id', 'Tags', 'CreationDate'], axis=1)
df_high.head()
Out[6]:
Title Body Y
1 Why are Java Optionals immutable? I'd like to understand why Java 8 Optionals we... HQ
2 Text Overlay Image with Darkened Opacity React... I am attempting to overlay a title over an ima... HQ
3 Why ternary operator in swift is so picky? The question is very simple, but I just could ... HQ
4 hide/show fab with scale animation I'm using custom floatingactionmenu. I need to... HQ
8 Changing Theme in Windows 10 UWP App Programma... I was able to change theme using <code>this.Re... HQ
In [7]:
df_low = df[(df['Y'] == 'LQ_CLOSE') | (df['Y'] == 'LQ_EDIT')]
df_low = df_low.drop(['Id', 'Tags', 'CreationDate'], axis=1)
df_low.head()
Out[7]:
Title Body Y
0 Java: Repeat Task Every Random Seconds I'm already familiar with repeating tasks ever... LQ_CLOSE
5 Accessing pointer member of the structure usin... I have defined integer pointer inside the stru... LQ_CLOSE
6 How To Disable 2nd Saturday 4th Saturday ,Sund... want to Disable 2nd Saturday 4th saturday, Su... LQ_EDIT
7 Resizing containers in bootstrap m new to bootstrap and i would like to find ou... LQ_EDIT
9 TextBox Value Disappears VB6 y case I am having two form Form 1 and Form 2.... LQ_EDIT
In [8]:
import string
string.punctuation
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import copy

df_high_copy = copy.deepcopy(df_high)
df_low_copy = copy.deepcopy(df_low)

def clean(dataset):
    for i in range(len(dataset)):
        #string_cleaned = dataset.iloc[i, 1].replace('<p>', '').replace('</p>', '').replace('</a>', '')
        m = re.sub("[\(\[].*?[\)\]]", " ", dataset.iloc[i, 1])
        m1 = re.sub("<.*?>", " ", m)
        m2 = re.sub("\n", " ", m1)
        m3 = re.sub("{.*?}", " ", m2)
        m4 = re.sub("&gt", "", m3)
        string_cleaned = re.sub("&lt", "", m4)
        w = string_cleaned.split()
        resultwords = [word for word in w if word.lower() not in stop_words]
        resultwords1 = [word for word in resultwords if word not in punctuation]
        dataset.iloc[i, 1] = ' '.join(resultwords1)
    return dataset
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ruohezhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [9]:
def wordcloud_array(dataset):
    comment_words = ''
    for idx in range(len(dataset)):
        ind = dataset.iloc[idx, 1].split()
        comment_words += " ".join(ind)+" "
    return comment_words

Words show up prequently in high-quality questions

In [252]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

wordcloud = WordCloud (
    background_color = 'white',
    width = 800,
    height = 800,
    random_state = 123).generate(wordcloud_array(clean(df_high_copy)))
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud) # image show
plt.axis('off') # to off the axis of x and y
plt.show()

Without cleaning data -- High quality questions

In [253]:
wordcloud_notclean = WordCloud (
    background_color = 'pink',
    width = 800,
    height = 800,
    random_state = 123).generate(wordcloud_array(df_high))
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud_notclean) # image show
plt.axis('off') # to off the axis of x and y
plt.show()

Words show up in low-quality questions

In [254]:
wordcloud_low = WordCloud (
    background_color = 'black',
    width = 800,
    height = 800,
    random_state = 123).generate(wordcloud_array(clean(df_low_copy)))
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud_low) # image show
plt.axis('off') # to off the axis of x and y
plt.show()
In [255]:
wordcloud_low_notclean = WordCloud (
    colormap='RdYlGn',
    width = 800,
    height = 800,
    random_state = 123).generate(wordcloud_array(df_low))
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud_low_notclean) # image show
plt.axis('off') # to off the axis of x and y
plt.show()
In [124]:
import string
string.punctuation
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english')) 
punctuation = set(string.punctuation)
for i in range(len(df_high)):
    string_cleaned = df_high.iloc[i, 1].replace('<p>', '').replace('</p>', '').replace('</a>', '')
    w = string_cleaned.split()
    resultwords = [word for word in w if word.lower() not in stop_words]
    resultwords1 = [word for word in resultwords if word not in punctuation]
    df_high.iloc[i, 1] = ' '.join(resultwords1)
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ruohezhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [125]:
comment_words = ''
for idx in range(len(df_high)):
    ind = df_high.iloc[idx, 1].split()
comment_words += " ".join(ind)+" " 
In [128]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

wordcloud = WordCloud (
                    background_color = 'black',
                    width = 800,
                    height = 800
                        ).generate(comment_words)
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud) # image show
plt.axis('off') # to off the axis of x and y
#plt.savefig('Plotly-World_Cloud.png')
plt.show()
In [51]:
### tags wordcloud
In [38]:
import numpy as np
label = []
for k in df.iloc[:, 3]:
    k = (','.join(k))
    label.append(k)
In [37]:
import numpy as np
label = []
for k in df.iloc[:, 3]:
    k = (','.join(k))
    label.append(k)
#label = list(np.unique(label)) #remove repetitive words

cloud = ''
for i in label:
    cloud += i + ' '
    
from wordcloud import WordCloud
import matplotlib.pyplot as plt

wordcloud = WordCloud (
                    background_color = 'pink',
                    width = 800,
                    height = 800
                        ).generate(cloud)
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud) # image show
plt.axis('off') # to off the axis of x and y
#plt.savefig('Plotly-World_Cloud.png')
plt.show()

Remove punctuation

In [8]:
import re
from itertools import chain
from nltk.tokenize import word_tokenize

list2 = []
for i in range(len(df)):
    list2.append(word_tokenize(re.sub(r'\W+', ' ',df.iloc[i, 2])))

wordlist = list(chain.from_iterable(list2))
In [9]:
#remove non-alphanumeric values
cleanlist = []
for k in wordlist:
    if k.isalnum() == True:
        cleanlist.append(k)    #len(wordlist) = 8501886, len(cleanlist) = 8356337
In [10]:
cloud1 = ''
for i in cleanlist:
    cloud1 += i + ' '
In [50]:
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)
import matplotlib.pyplot as plt

wordcloud = WordCloud(
                    colormap='RdYlGn',
                    stopwords = stopwords,
                    background_color = 'white',
                    width = 800,
                    height = 800
                        ).generate(cloud1)
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud) # image show
plt.axis('off') # to off the axis of x and y
#plt.savefig('Plotly-World_Cloud.png')
plt.show()

Encoding Y

In [10]:
data = df.drop(['Id', 'Tags', 'CreationDate'], axis=1)
mat = {'LQ_CLOSE':0, 'LQ_EDIT': 1, 'HQ':2}
data['Y'] = data['Y'].map(mat)
data.head()
Out[10]:
Title Body Y
0 Java: Repeat Task Every Random Seconds I'm already familiar with repeating tasks ever... 0
1 Why are Java Optionals immutable? I'd like to understand why Java 8 Optionals we... 2
2 Text Overlay Image with Darkened Opacity React... I am attempting to overlay a title over an ima... 2
3 Why ternary operator in swift is so picky? The question is very simple, but I just could ... 2
4 hide/show fab with scale animation I'm using custom floatingactionmenu. I need to... 2
In [11]:
data['Y'].value_counts()
Out[11]:
2    20000
1    20000
0    20000
Name: Y, dtype: int64
In [24]:
import matplotlib.pyplot as plt
labels = ['High Quality Questions', 'Low Quality Question - Close', 'Low Quality Question - Edit']
values = [len(data[data['Y'] == 2]), len(data[data['Y'] == 0]), len(data[data['Y'] == 1])]
#plt.figure(figsize=(16, 9))
plt.pie(x=values, labels=labels, autopct="%1.1f%%", colors = ['pink', 'olive', 'cyan'])
plt.title("Y Value Distribution")
plt.show()

The number of questions with different qualities is evenly distributed

The relation between the length of titles and questions and the quality of questions

In [13]:
data['Body_length'] = data['Body'].apply(len)
data['Title_length'] = data['Title'].apply(len)
In [14]:
data.head()
Out[14]:
Title Body Y Body_length Title_length
0 Java: Repeat Task Every Random Seconds I'm already familiar with repeating tasks ever... 0 304 38
1 Why are Java Optionals immutable? I'd like to understand why Java 8 Optionals we... 2 104 33
2 Text Overlay Image with Darkened Opacity React... I am attempting to overlay a title over an ima... 2 4805 53
3 Why ternary operator in swift is so picky? The question is very simple, but I just could ... 2 665 42
4 hide/show fab with scale animation I'm using custom floatingactionmenu. I need to... 2 375 34
In [15]:
import seaborn as sns
emptygrid = sns.FacetGrid(data,col='Y',size=3)
emptygrid.map(plt.hist,'Body_length')
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/seaborn/axisgrid.py:243: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)
Out[15]:
<seaborn.axisgrid.FacetGrid at 0x11bf99a00>
In [13]:
data.corr()
Out[13]:
Y Body_length Title_length
Y 1.000000 0.090158 0.071399
Body_length 0.090158 1.000000 0.068254
Title_length 0.071399 0.068254 1.000000
In [14]:
emptygrid1 = sns.FacetGrid(data,col='Y',size=3)
emptygrid1.map(plt.hist, 'Title_length')
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/seaborn/axisgrid.py:243: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)
Out[14]:
<seaborn.axisgrid.FacetGrid at 0x124b2cd00>

According to both the graph and the correlation table, the length of the body or title has little relationship with the quality of the question

In [17]:
import re
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^(a-zA-Z)\s]','', text)
    return text
data['Body'] = data['Body'].apply(clean_text)
data.head()
Out[17]:
Title Body Y Body_length Title_length
0 Java: Repeat Task Every Random Seconds im already familiar with repeating tasks every... 0 304 38
1 Why are Java Optionals immutable? id like to understand why java optionals were... 2 104 33
2 Text Overlay Image with Darkened Opacity React... i am attempting to overlay a title over an ima... 2 4805 53
3 Why ternary operator in swift is so picky? the question is very simple but i just could n... 2 665 42
4 hide/show fab with scale animation im using custom floatingactionmenu i need to i... 2 375 34
In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_withstop = vectorizer.fit_transform(data['Body'])
y_withstop = data['Y'].values
In [18]:
from sklearn.model_selection import train_test_split

X_train_withstop, X_test_withstop, y_train_withstop, y_test_withstop = \
        train_test_split(X_withstop, y_withstop, test_size=0.2, 
                         shuffle=True, random_state=123, stratify=y_withstop)

fit models without removing stopwords

In [84]:
# with stopwords
from sklearn.ensemble import RandomForestClassifier
forest1 = RandomForestClassifier(n_estimators=100,
                                random_state=123,
                                max_depth = 50)
forest1.fit(X_train_withstop, y_train_withstop)
print(f"Train Accuracy: {forest1.score(X_train_withstop, y_train_withstop)*100:0.3f}%")
print(f"Test Accuracy: {forest1.score(X_test_withstop, y_test_withstop)*100:0.3f}%")
Train Accuracy: 93.644%
Test Accuracy: 75.817%
In [263]:
from xgboost import XGBClassifier
xg_classifier1 = XGBClassifier(random_state = 123)
xg_classifier1.fit(X_train_withstop, y_train_withstop)
print(f"Train Accuracy: {xg_classifier1.score(X_train_withstop, y_train_withstop)*100:0.3f}%")
print(f"Test Accuracy: {xg_classifier1.score(X_test_withstop, y_test_withstop)*100:0.3f}%")
[00:21:18] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Train Accuracy: 88.990%
Test Accuracy: 82.267%
In [264]:
from catboost import CatBoostClassifier


boost1 = CatBoostClassifier(verbose=0)

boost1.fit(X_train_withstop, y_train_withstop)
print(f"Train Accuracy: {boost1.score(X_train_withstop, y_train_withstop)*100:0.3f}%")
print(f"Test Accuracy: {boost1.score(X_test_withstop, y_test_withstop)*100:0.3f}%")
Train Accuracy: 84.867%
Test Accuracy: 82.283%

Remove stopwords

In [19]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
print(stopwords.words('english'))
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ruohezhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [16]:
data1 = copy.deepcopy(data)
In [17]:
stop_words = set(stopwords.words('english')) 
punctuation = set(string.punctuation)
#only run one time, took 15 minutes
#for i in range(len(df)):
    #w = data1.iloc[i, 1].split()
    #resultwords = [word for word in w if word.lower() not in stop_words]
    #data1.iloc[i, 1] = ' '.join(resultwords)

Split dataset

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(clean(data1)['Body'])
y = data1['Y'].values
In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.2, 
                         shuffle=True, random_state=123, stratify=y)
In [26]:
X_train_title, X_test_title, y_train_title, y_test_title = \
        train_test_split(X, y, test_size=0.2, 
                         shuffle=True, random_state=123, stratify=y)

Fit in models

In [33]:
# without stopwords
from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print(f"Train Accuracy: {knn.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {knn.score(X_test, y_test)*100:0.3f}%")
Train Accuracy: 69.248%
Test Accuracy: 55.383%
In [32]:
# without stopwords
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100,
                                random_state=123,
                                max_depth = 50)
forest.fit(X_train, y_train)
print(f"Train Accuracy: {forest.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {forest.score(X_test, y_test)*100:0.3f}%")
Train Accuracy: 90.271%
Test Accuracy: 77.033%
In [27]:
from xgboost import XGBClassifier
xg_classifier = XGBClassifier(random_state = 123)
xg_classifier.fit(X_train, y_train)
print(f"Train Accuracy: {xg_classifier.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {xg_classifier.score(X_test, y_test)*100:0.3f}%")
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/xgboost/sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
[23:37:54] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Train Accuracy: 86.998%
Test Accuracy: 81.017%
In [59]:
from catboost import CatBoostClassifier


boost = CatBoostClassifier(verbose=0, random_state = 123)

boost.fit(X_train, y_train)
print(f"Train Accuracy: {boost.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {boost.score(X_test, y_test)*100:0.3f}%")
Train Accuracy: 83.921%
Test Accuracy: 81.467%
In [28]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state = 123)
lr.fit(X_train, y_train)
print(f"Train Accuracy: {lr.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {lr.score(X_test, y_test)*100:0.3f}%")
Train Accuracy: 91.629%
Test Accuracy: 81.983%
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Try Logistic Regression!!!

试一下neural network

In [ ]:
# 在titles上run各种模型
In [58]:
from catboost import CatBoostClassifier


boost = CatBoostClassifier(verbose=0)

boost.fit(X_train_title, y_train_title)
print(f"Train Accuracy: {boost.score(X_train_title, y_train_title)*100:0.3f}%")
print(f"Test Accuracy: {boost.score(X_test_title, y_test_title)*100:0.3f}%")
#title cannot be used for predicting.
Train Accuracy: 65.654%
Test Accuracy: 60.558%
In [20]:
list_corpus = data["Body"].tolist()
list_labels = data["Y"].tolist()
X_train_lime, X_test_lime, y_train_lime, y_test_lime = train_test_split(list_corpus, list_labels, test_size=0.2, random_state=40)
In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

train_vectors = vectorizer.fit_transform(X_train_lime)
test_vectors = vectorizer.transform(X_test_lime)

logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(train_vectors, y_train_lime)
pred = logreg.predict(test_vectors)
accuracy = accuracy_score(y_test_lime, pred)
precision = precision_score(y_test_lime, pred, average='weighted')
recall = recall_score(y_test_lime, pred, average='weighted')
f1 = f1_score(y_test_lime, pred, average='weighted')
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
accuracy = 0.824, precision = 0.824, recall = 0.824, f1 = 0.824
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [22]:
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer
c = make_pipeline(vectorizer, logreg)
class_names=list(data['Y'].unique())
explainer = LimeTextExplainer(class_names=class_names)
In [26]:
from xgboost import XGBClassifier
xg = XGBClassifier(random_state = 123)
xg.fit(train_vectors, y_train_lime)
d = make_pipeline(vectorizer, xg)
class_names=list(data['Y'].unique())
explainer_d = LimeTextExplainer(class_names=class_names)
/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/xgboost/sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
[22:07:49] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
In [28]:
from sklearn.ensemble import RandomForestClassifier
fo = RandomForestClassifier(n_estimators=100,
                                random_state=123,
                                max_depth = 50)
e = make_pipeline(vectorizer, xg)
class_names=list(data['Y'].unique())
explainer_e = LimeTextExplainer(class_names=class_names)
In [31]:
for idx in range(3):
    exp = explainer_e.explain_instance(X_test_lime[idx], e.predict_proba, num_features=6)
    exp.show_in_notebook()
In [32]:
for idx in range(3):
    exp = explainer_d.explain_instance(X_test_lime[idx], d.predict_proba, num_features=6)
    exp.show_in_notebook()
In [33]:
#High_quality : 2
for idx in range(3):
    exp = explainer.explain_instance(X_test_lime[idx], c.predict_proba, num_features=6)
    exp.show_in_notebook()

build confusion matrix & mcnemar's test

In [38]:
from mlxtend.evaluate import mcnemar
from mlxtend.evaluate import mcnemar_table

tb = mcnemar_table(y_target=y_train, 
                   y_model1=xg_classifier.predict(X_train), 
                   y_model2=lr.predict(X_train))

tb_test = mcnemar_table(y_target=y_test, 
                   y_model1=xg_classifier.predict(X_test), 
                   y_model2=lr.predict(X_test))

from mlxtend.plotting import checkerboard_plot
import matplotlib.pyplot as plt

brd = checkerboard_plot(tb,
                        figsize=(6, 6),
                        fmt='%d',
                        col_labels=['model 2 correct (train)', 'model 2 wrong (train)'],
                        row_labels=['model 1 correct (train)', 'model 1 wrong (train)'])
brd_test = checkerboard_plot(tb_test,
                        figsize=(6, 6),
                        fmt='%d',
                        col_labels=['model 2 correct (test)', 'model 2 wrong (test)'],
                        row_labels=['model 1 correct (test)', 'model 1 wrong (test)'])



plt.show()
In [36]:
chi2, p = mcnemar(ary=tb, corrected=True)
print('chi-squared-for-train:', chi2)
print('p-value-for-train:', p)
chi2, p = mcnemar(ary=tb_test, corrected=True)
print('chi-squared-for-test:', chi2)
print('p-value-for-test:', p)
chi-squared-for-train: 888.4801151700558
p-value-for-train: 3.1341741004492237e-195
chi-squared-for-test: 7.715869311551925
p-value-for-test: 0.005473749531437572
In [41]:
from mlxtend.evaluate import confusion_matrix
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix

cm = confusion_matrix(y_target=y_test, 
                      y_predicted=xg_classifier.predict(X_test), 
                      binary=True, 
                      positive_label=1)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()
In [42]:
cm_lr= confusion_matrix(y_target=y_test, 
                      y_predicted=lr.predict(X_test), 
                      binary=True, 
                      positive_label=1)
fig, ax = plot_confusion_matrix(conf_mat=cm_lr)
plt.show()